Kapitel 6.3: Gattungen¶
Das Notebook ergänzt Kapitel 6.3 'Gattungen'.
Import¶
In [1]:
import pandas as pd
import numpy as np
from resources_statistics import *
from resources_geschichtslyrik import *
import plotly.express as px
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator
from tqdm.notebook import tqdm
In [2]:
# Load the metadata table; the sub-corpora in the next cells are all filtered from this frame.
META_PATH = r"../resources/meta.json"
meta = pd.read_json(META_PATH)
Korpora¶
In [3]:
# Anthology corpus: rows of corpus 'anth' dated 1850–1918 that are flagged
# geschichtslyrik == 1, keeping the first row per author_title value.
anth_filter = "corpus=='anth' and 1850 <= year <= 1918 and geschichtslyrik == 1"
meta_anth = meta.query(anth_filter).drop_duplicates(subset='author_title')

# binarize_meta comes from the star-imported project modules; its result is
# the frame used for the correlation analyses further down.
meta_anth_bin = binarize_meta(meta_anth)
In [4]:
# Authors that make up the "Kanonisierte Moderne" comparison corpus.
modcanon_authors = [
    'Hofmannsthal, Hugo von',
    'Rilke, Rainer Maria',
    'George, Stefan',
    'Heym, Georg',
]

# Same period / geschichtslyrik / de-duplication rules as for the anthology
# corpus, but selected by author instead of by corpus label.
modcanon_filter = "author in @modcanon_authors and 1850 <= year <= 1918 and geschichtslyrik == 1"
meta_modcanon = meta.query(modcanon_filter).drop_duplicates(subset='author_title')
In [5]:
# Authors that make up the "Münchhausen-Kreis" comparison corpus.
muench_authors = [
    'Münchhausen, Börries von',
    'Miegel, Agnes',
    'Strauß und Torney, Lulu von',
]

# Same period / geschichtslyrik / de-duplication rules as for the anthology
# corpus, but selected by author instead of by corpus label.
muench_filter = "author in @muench_authors and 1850 <= year <= 1918 and geschichtslyrik == 1"
meta_muench = meta.query(muench_filter).drop_duplicates(subset='author_title')
In [6]:
# Display name -> metadata frame for the three corpora that are compared.
corpora = {
    'Anthologien': meta_anth,
    'Kanonisierte Moderne': meta_modcanon,
    'Münchhausen-Kreis': meta_muench,
}
sub_names = list(corpora.keys())
sub_metas = list(corpora.values())

# Summary table (one row per corpus); it is filled in the next cell.
sub_df = pd.DataFrame()
Gattungshäufigkeit – Korpora¶
In [7]:
# (result column, source column, literal substring to look for).
# A substring of None means "count the rows where the source column is missing".
# The order of this list is the column order of the resulting table.
summary_specs = [
    ('Balladen', 'gattung', 'Ballade'),
    ('Rollengedichte', 'gattung', 'Rollengedicht'),
    ('Denkmal-/Ruinenpoesie', 'gattung', 'Denkmal'),
    ('Lieder', 'gattung', 'Lied'),
    ('Sonette', 'gattung', 'Sonett'),
    ('Keine Gattung', 'gattung', None),
    ('Mehrere Gattungen', 'gattung', '+'),
    ('Ereignis', 'inhaltstyp', 'Ereignis'),
    ('Zustand', 'inhaltstyp', 'Zustand'),
    ('Erzählen', 'sprechakte', 'Erzählen'),
    ('Beschreiben', 'sprechakte', 'Beschreiben'),
    ('Behaupten', 'sprechakte', 'Behaupten'),
    ('Auffordern', 'sprechakte', 'Auffordern'),
    ('Fragen', 'sprechakte', 'Fragen'),
]


def _summarize_corpus(corpus_meta):
    """Return one summary row (dict) for a corpus metadata frame.

    The row holds the rounded mean year, the number of texts and, for every
    entry of ``summary_specs``, the absolute count plus its share of all texts
    (``'<label> (Anteil)'``). Substrings are matched literally (no regex) and
    missing values never count as a match. For an empty corpus the shares are
    NaN instead of raising ``ZeroDivisionError``.
    """
    n_texts = len(corpus_meta)
    row = {
        'Jahr': round(corpus_meta['year'].mean(), 0),
        'Texte': n_texts,
    }
    for label, column, needle in summary_specs:
        if needle is None:
            hits = corpus_meta[column].isna()
        else:
            # regex=False keeps '+' literal; na=False treats missing annotations as "no match".
            hits = corpus_meta[column].str.contains(needle, na=False, regex=False)
        count = int(hits.sum())
        row[label] = count
        row[f'{label} (Anteil)'] = count / n_texts if n_texts else float('nan')
    return row


# Build the whole table in one go (one row per corpus) instead of growing it
# cell by cell; re-running this cell therefore always yields the same frame.
# All columns are cast to float, matching how the table was displayed before.
sub_df = pd.DataFrame(
    [_summarize_corpus(this_meta) for this_meta in sub_metas],
    index=sub_names,
).astype(float)
In [8]:
# Show the corpus summary with two decimals.
sub_df.round(2)
Out[8]:
| Jahr | Texte | Balladen | Balladen (Anteil) | Rollengedichte | Rollengedichte (Anteil) | Denkmal-/Ruinenpoesie | Denkmal-/Ruinenpoesie (Anteil) | Lieder | Lieder (Anteil) | ... | Erzählen | Erzählen (Anteil) | Beschreiben | Beschreiben (Anteil) | Behaupten | Behaupten (Anteil) | Auffordern | Auffordern (Anteil) | Fragen | Fragen (Anteil) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Anthologien | 1875.0 | 1850.0 | 1036.0 | 0.56 | 237.0 | 0.13 | 73.0 | 0.04 | 142.0 | 0.08 | ... | 1420.0 | 0.77 | 526.0 | 0.28 | 345.0 | 0.19 | 141.0 | 0.08 | 35.0 | 0.02 |
| Kanonisierte Moderne | 1903.0 | 113.0 | 10.0 | 0.09 | 18.0 | 0.16 | 3.0 | 0.03 | 0.0 | 0.00 | ... | 57.0 | 0.50 | 72.0 | 0.64 | 30.0 | 0.27 | 4.0 | 0.04 | 9.0 | 0.08 |
| Münchhausen-Kreis | 1905.0 | 140.0 | 76.0 | 0.54 | 33.0 | 0.24 | 1.0 | 0.01 | 7.0 | 0.05 | ... | 108.0 | 0.77 | 53.0 | 0.38 | 25.0 | 0.18 | 5.0 | 0.04 | 4.0 | 0.03 |
3 rows × 30 columns
Gattungshäufigkeit – Zeitverlauf¶
In [9]:
# Empty time-series frame with one row per year from 1850 to 1918;
# the columns are added in the following cells.
ts = pd.DataFrame(index=pd.Index(list(range(1850, 1919)), name='year'))
In [10]:
# Texts per year in the anthology corpus; years without any text get 0.
texts_per_year = meta_anth.groupby('year').size()
ts['text_count'] = texts_per_year.reindex(ts.index).fillna(0)
ts['text_sum'] = smooth(ts['text_count'], mode='sum')

# The same counts without the authors listed in muench_authors.
anth_without_muench = meta_anth.query("author not in @muench_authors")
ts['text_nomuench_count'] = [
    anth_without_muench.query("year == @this_year").shape[0]
    for this_year in ts.index
]
ts['text_nomuench_sum'] = smooth(ts['text_nomuench_count'], mode='sum')
In [11]:
def _add_share_columns(prefix, hits, total_column='text_sum', with_confint=True):
    """Add the yearly time-series columns for one feature to ``ts``.

    Parameters
    ----------
    prefix : str
        Stem of the new columns: ``<prefix>_count`` (texts per year),
        ``<prefix>_sum`` (``smooth(..., mode='sum')`` of the counts) and
        ``<prefix>_share_smoothed`` (sum divided by ``total_column``).
    hits : boolean Series aligned with ``meta_anth``
        Marks the texts that have the feature.
    total_column : str
        Column of ``ts`` used as denominator for the share.
    with_confint : bool
        If true, also store ``<prefix>_share_smoothed_low`` / ``_high`` from
        ``proportion_confint(sum, total, 0.1)``.
        NOTE(review): 0.1 is presumably the alpha level (90 % interval) — confirm
        against the proportion_confint that the star imports provide.
    """
    # One pass over the matching rows instead of one DataFrame.query per year.
    per_year = meta_anth.loc[hits, 'year'].value_counts()
    ts[f'{prefix}_count'] = per_year.reindex(ts.index, fill_value=0)
    ts[f'{prefix}_sum'] = smooth(ts[f'{prefix}_count'], mode='sum')
    ts[f'{prefix}_share_smoothed'] = ts[f'{prefix}_sum'] / ts[total_column]
    if with_confint:
        # Evaluate each interval once and split it into its two bounds.
        bounds = [
            proportion_confint(successes, total, 0.1)
            for successes, total in zip(ts[f'{prefix}_sum'], ts[total_column])
        ]
        ts[f'{prefix}_share_smoothed_low'] = [bound[0] for bound in bounds]
        ts[f'{prefix}_share_smoothed_high'] = [bound[1] for bound in bounds]


# --- Genres ------------------------------------------------------------------
# na=False: texts without a genre annotation never count as a match.
gattung = meta_anth['gattung']
is_ballade = gattung.str.contains('Ballade', na=False)
not_muench = ~meta_anth['author'].isin(muench_authors)

_add_share_columns('ballade', is_ballade)
# Ballads without the Münchhausen authors, relative to the texts without those authors.
_add_share_columns('ballade_nomuench', is_ballade & not_muench, total_column='text_nomuench_sum')
_add_share_columns('rollengedicht', gattung.str.contains('Rollengedicht', na=False))
_add_share_columns('lied', gattung.str.contains('Lied', na=False))
# Sonnets and monument poems are tracked without confidence bounds.
_add_share_columns('sonett', gattung.str.contains('Sonett', na=False), with_confint=False)
_add_share_columns('denkmal', gattung.str.contains('Denkmal', na=False), with_confint=False)
_add_share_columns('nogattung', gattung.isna())

# --- Content type --------------------------------------------------------------
for prefix, needle in [('ereignis', 'Ereignis'), ('zustand', 'Zustand')]:
    _add_share_columns(prefix, meta_anth['inhaltstyp'].str.contains(needle, na=False))

# --- Speech acts ---------------------------------------------------------------
speech_acts = [
    ('erzaehlen', 'Erzählen'),
    ('beschreiben', 'Beschreiben'),
    ('behaupten', 'Behaupten'),
    ('auffordern', 'Auffordern'),
    ('fragen', 'Fragen'),
]
for prefix, needle in speech_acts:
    _add_share_columns(prefix, meta_anth['sprechakte'].str.contains(needle, na=False))
In [12]:
# Smoothed share column of ts -> legend label used in the figure.
genre_labels = {
    'ballade_share_smoothed': 'Ballade',
    'rollengedicht_share_smoothed': 'Rollengedicht',
    'lied_share_smoothed': 'Lied',
    'nogattung_share_smoothed': 'Keine Gattung',
}
meta_plot = ts[list(genre_labels)].rename(columns=genre_labels)

# [low, high] bound columns belonging to each plotted share
# (defined here but not passed to create_ts_plot in this cell).
confint_columns = [[f'{column}_low', f'{column}_high'] for column in genre_labels]

save_ts_data(meta_plot, prefix='06_03_Gattungen_')

corpus_share_columns = [
    'Balladen (Anteil)',
    'Rollengedichte (Anteil)',
    'Lieder (Anteil)',
    'Keine Gattung (Anteil)',
]
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=corpus_share_columns,
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.3.1 Gattungsüberblick.pdf")
fig.show()
Balladenhäufigkeit ohne Münchhausen¶
In [13]:
# Ballad share with and without the authors listed in muench_authors.
ballad_labels = {
    'ballade_share_smoothed': 'Ballade',
    'ballade_nomuench_share_smoothed': 'Ballade (ohne Münchhausen)',
}
meta_plot = ts[list(ballad_labels)].rename(columns=ballad_labels)

fig = create_ts_plot(data=meta_plot, columns=meta_plot.columns, y_axis_title='Anteil')
fig.show()
Balladenanteil häufigster Autor:innen um 1900 im Anthologiekorpus¶
In [14]:
# Anthology texts dated 1895–1905 (both bounds inclusive) ...
in_window = meta_anth['year'].between(1895, 1905)
meta_anth_1895 = meta_anth[in_window]

# ... and the subset of them whose genre annotation contains 'Ballade'.
is_ballade_1895 = meta_anth_1895['gattung'].str.contains('Ballade', na=False)
meta_anth_1895_balladen = meta_anth_1895[is_ballade_1895]
In [15]:
# Report the size of the 1895–1905 window and its ballad share.
n_texts_1895 = len(meta_anth_1895)
n_balladen_1895 = len(meta_anth_1895_balladen)
print(f"Texte 1895–1905 : {n_texts_1895}")
print(f"Balladen 1895–1905 : {n_balladen_1895}")
print(f"Balladenanteil 1895–1905 : {n_balladen_1895/n_texts_1895}")
Texte 1895–1905 : 127 Balladen 1895–1905 : 61 Balladenanteil 1895–1905 : 0.48031496062992124
In [16]:
# Authors with at least five texts in the 1895–1905 window, most frequent first.
top_authors = meta_anth_1895['author'].value_counts()
top_authors_list = top_authors.loc[lambda counts: counts >= 5].index.tolist()

# Per author: number of texts, number of ballads and the resulting share.
for author in top_authors_list:
    meta_author = meta_anth_1895[meta_anth_1895['author'] == author]
    meta_author_balladen = meta_author[meta_author['gattung'].str.contains('Ballade', na=False)]
    n_author_texts = len(meta_author)
    n_author_balladen = len(meta_author_balladen)
    print(author)
    print(f"Texte : {n_author_texts}")
    print(f"Balladen : {n_author_balladen}")
    print(f"Balladenanteil : {n_author_balladen/n_author_texts}")
    print("\n")
Schrutz, Demetrius Texte : 9 Balladen : 9 Balladenanteil : 1.0 Miegel, Agnes Texte : 7 Balladen : 6 Balladenanteil : 0.8571428571428571 Münchhausen, Börries von Texte : 7 Balladen : 3 Balladenanteil : 0.42857142857142855 Gaudy, Alice von Texte : 6 Balladen : 5 Balladenanteil : 0.8333333333333334 Greif, Martin Texte : 5 Balladen : 0 Balladenanteil : 0.0 Kunad, Paul Texte : 5 Balladen : 1 Balladenanteil : 0.2
Ballade – Charakteristische Merkmale¶
In [17]:
# Column of meta_anth_bin whose relations to the other features are analysed in the following cells.
main_feature = 'ballade'
In [18]:
# The 20 features with the highest correlation to main_feature.
corr_with_main = meta_anth_bin.corr(numeric_only=True)[main_feature]
corr_with_main.sort_values(ascending=False).head(20)
Out[18]:
ballade 1.000000 sprechakt_erzaehlen_vorhanden 0.615652 ereignis 0.577300 in_hohem_mass_konkret 0.430865 konkretheit 0.422765 wissen_ergaenzend 0.387342 words 0.364751 entity_count 0.341118 bekanntes_individuum_count 0.302330 unbekanntes_individuum_positiv 0.285839 nation_volk_d_negativ 0.264690 kleinraum_count 0.254498 fixierbarkeit 0.249569 entity_negativ 0.198330 entity_positiv 0.194662 liebe_negativ 0.188982 persmarker_vorhanden 0.184839 mittelalter 0.167912 kollektiv_negativ 0.163956 unbekanntes_individuum_count 0.148764 Name: ballade, dtype: float64
In [19]:
# The 20 features with the lowest (most negative) correlation to main_feature.
corr_with_main = meta_anth_bin.corr(numeric_only=True)[main_feature]
corr_with_main.sort_values(ascending=True).head(20)
Out[19]:
zustand -0.618967 nogenre -0.613281 sprechakt_beschreiben_vorhanden -0.513126 sprechinstanz_markiert -0.456834 sprechakte_count -0.450952 sprechakt_behaupten_vorhanden -0.425517 rollengedicht -0.406373 wissen_identisch -0.382442 sprechinstanz_in_vergangenheit -0.358080 gegenwartsdominant -0.333175 gegenwartsbezug -0.312448 sprechakt_auffordern_vorhanden -0.283006 denkmal -0.223064 nationalismus -0.214500 sprechinstanz_nicht_in_vergangenheit -0.212310 lied -0.206659 nichtmensch_count -0.199440 ueberlieferung -0.188982 decade -0.184349 year -0.171483 Name: ballade, dtype: float64
In [20]:
# Threshold handed to get_features; the same value is reused further down as
# the minimum phi in the results filter.
threshold = 0.2

# Compute the correlations with main_feature once and reuse them for both
# feature lists (the full correlation matrix was previously built twice).
corr_with_main = meta_anth_bin.corr(numeric_only=True)[main_feature]

# 'heroismus' is always compared in addition to the automatically selected
# binary features.
bin_comp_features = get_features(corr_with_main, threshold = threshold, mode = 'bin') + ['heroismus']
cont_comp_features = get_features(corr_with_main, threshold = threshold, mode = 'cont')
In [21]:
# Relate main_feature to every selected binary feature across the whole anthology corpus.
binbin_arguments = {
    'meta': meta_anth_bin,
    'main_feature': main_feature,
    'comp_features': bin_comp_features,
}
results = relations_binbin(**binbin_arguments)
In [22]:
# Features excluded from the filtered table below.
directly_related = [
    'nogenre', 'denkmal', 'lied', 'rollengedicht',
    'wissen_ergaenzend',  # related to 'wissen_identisch'
    'sprechinstanz_nicht_in_vergangenheit', 'sprechinstanz_in_vergangenheit',  # related to 'gegenwartsbezug'
]

# Keep rows with chi2_p below 0.05, min_expected of at least 5 and phi of at
# least the threshold, ordered by the size of the difference.
relevance_filter = (
    "index not in @directly_related "
    "and chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold"
)
results_filtered = results.query(relevance_filter).sort_values(by='diff', ascending=False)
results_filtered.round(2)
Out[22]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sprechakt_erzaehlen_vorhanden | 0.47 | 386/814 | 1.00 | 1034/1036 | 0.49 | 0.49 | 0.52 | 0.56 | 0.56 | 701.20 | 0.0 | 0.0 | 0.62 | 2.0 | 189.20 |
| ereignis | 0.50 | 409/814 | 0.99 | 1024/1036 | 0.45 | 0.45 | 0.49 | 0.52 | 0.52 | 616.56 | 0.0 | 0.0 | 0.58 | 12.0 | 183.48 |
| in_hohem_mass_konkret | 0.63 | 513/814 | 0.97 | 1000/1036 | 0.30 | 0.30 | 0.34 | 0.37 | 0.37 | 343.44 | 0.0 | 0.0 | 0.43 | 36.0 | 148.28 |
| fixierbarkeit | 0.50 | 410/814 | 0.75 | 772/1036 | 0.20 | 0.20 | 0.24 | 0.28 | 0.28 | 115.23 | 0.0 | 0.0 | 0.25 | 264.0 | 293.92 |
| unbekanntes_individuum_positiv | 0.18 | 32/181 | 0.41 | 185/450 | 0.16 | 0.16 | 0.23 | 0.31 | 0.30 | 31.41 | 0.0 | 0.0 | 0.22 | 32.0 | 62.25 |
| nationalismus | 0.22 | 179/814 | 0.07 | 74/1036 | -0.18 | -0.18 | -0.15 | -0.12 | -0.12 | 85.12 | 0.0 | 0.0 | 0.21 | 74.0 | 111.32 |
| sprechakt_auffordern_vorhanden | 0.16 | 131/814 | 0.01 | 10/1036 | -0.18 | -0.18 | -0.15 | -0.13 | -0.13 | 148.17 | 0.0 | 0.0 | 0.28 | 10.0 | 62.04 |
| gegenwartsdominant | 0.28 | 225/814 | 0.04 | 42/1036 | -0.27 | -0.27 | -0.24 | -0.20 | -0.21 | 205.36 | 0.0 | 0.0 | 0.33 | 42.0 | 117.48 |
| wissen_identisch | 0.29 | 239/814 | 0.02 | 25/1036 | -0.30 | -0.30 | -0.27 | -0.24 | -0.24 | 270.58 | 0.0 | 0.0 | 0.38 | 25.0 | 116.16 |
| gegenwartsbezug | 0.45 | 366/814 | 0.16 | 170/1036 | -0.33 | -0.33 | -0.29 | -0.24 | -0.25 | 180.60 | 0.0 | 0.0 | 0.31 | 170.0 | 235.84 |
| sprechakt_behaupten_vorhanden | 0.37 | 304/814 | 0.04 | 41/1036 | -0.37 | -0.37 | -0.33 | -0.30 | -0.30 | 334.97 | 0.0 | 0.0 | 0.43 | 41.0 | 151.80 |
| sprechinstanz_markiert | 0.69 | 565/814 | 0.24 | 246/1036 | -0.50 | -0.50 | -0.46 | -0.42 | -0.42 | 386.09 | 0.0 | 0.0 | 0.46 | 246.0 | 356.84 |
| sprechakt_beschreiben_vorhanden | 0.55 | 444/814 | 0.08 | 82/1036 | -0.50 | -0.50 | -0.47 | -0.43 | -0.43 | 487.10 | 0.0 | 0.0 | 0.51 | 82.0 | 231.44 |
| zustand | 0.76 | 618/814 | 0.14 | 150/1036 | -0.65 | -0.65 | -0.61 | -0.58 | -0.58 | 708.77 | 0.0 | 0.0 | 0.62 | 150.0 | 337.92 |
In [23]:
# Everything that did not make it into results_filtered, for reference.
is_filtered_out = ~results.index.isin(results_filtered.index)
results_other = results[is_filtered_out]
results_other.sort_values(by='diff', ascending=False).round(2)
Out[23]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| wissen_ergaenzend | 0.56 | 454/814 | 0.90 | 929/1036 | 0.30 | 0.30 | 0.34 | 0.38 | 0.38 | 277.56 | 0.00 | 0.00 | 0.39 | 107.0 | 205.48 |
| nation_volk_d_negativ | 0.07 | 3/45 | 0.27 | 3/11 | -0.04 | -0.07 | 0.21 | 0.48 | 0.50 | 3.92 | 0.05 | 0.08 | 0.26 | 3.0 | 1.18 |
| kollektiv_negativ | 0.18 | 87/472 | 0.31 | 181/592 | 0.07 | 0.07 | 0.12 | 0.17 | 0.17 | 20.55 | 0.00 | 0.00 | 0.14 | 87.0 | 118.89 |
| heroismus | 0.22 | 182/814 | 0.33 | 341/1036 | 0.06 | 0.07 | 0.11 | 0.15 | 0.14 | 25.05 | 0.00 | 0.00 | 0.12 | 182.0 | 230.12 |
| entity_negativ | 0.12 | 180/1453 | 0.19 | 464/2463 | 0.04 | 0.04 | 0.06 | 0.09 | 0.09 | 27.67 | 0.00 | 0.00 | 0.08 | 180.0 | 238.95 |
| bekanntes_individuum_negativ | 0.10 | 63/651 | 0.16 | 213/1374 | 0.03 | 0.03 | 0.06 | 0.09 | 0.09 | 12.73 | 0.00 | 0.00 | 0.08 | 63.0 | 88.73 |
| stoffgebiet_negativ | 0.18 | 194/1087 | 0.24 | 342/1453 | 0.02 | 0.03 | 0.06 | 0.09 | 0.09 | 12.09 | 0.00 | 0.00 | 0.07 | 194.0 | 229.38 |
| unbekanntes_individuum_negativ | 0.09 | 16/181 | 0.14 | 65/450 | 0.01 | 0.00 | 0.06 | 0.11 | 0.10 | 3.62 | 0.06 | 0.07 | 0.08 | 16.0 | 23.23 |
| entity_positiv | 0.46 | 673/1453 | 0.49 | 1198/2463 | -0.01 | -0.01 | 0.02 | 0.06 | 0.06 | 1.97 | 0.16 | 0.16 | 0.02 | 673.0 | 694.22 |
| entity_ambivalent | 0.05 | 75/1453 | 0.07 | 169/2463 | 0.00 | 0.00 | 0.02 | 0.03 | 0.03 | 4.52 | 0.03 | 0.03 | 0.03 | 75.0 | 90.53 |
| stoffgebiet_ambivalent | 0.12 | 135/1087 | 0.14 | 203/1453 | -0.01 | -0.01 | 0.02 | 0.04 | 0.04 | 1.30 | 0.25 | 0.26 | 0.02 | 135.0 | 144.65 |
| bekanntes_individuum_positiv | 0.56 | 365/651 | 0.57 | 787/1374 | -0.03 | -0.03 | 0.01 | 0.06 | 0.06 | 0.26 | 0.61 | 0.63 | 0.01 | 286.0 | 280.65 |
| stoffgebiet_neutral | 0.21 | 228/1087 | 0.17 | 254/1453 | -0.07 | -0.07 | -0.03 | -0.00 | -0.00 | 4.94 | 0.03 | 0.03 | 0.04 | 228.0 | 206.27 |
| stoffgebiet_positiv | 0.49 | 530/1087 | 0.45 | 654/1453 | -0.07 | -0.08 | -0.04 | 0.00 | 0.00 | 3.51 | 0.06 | 0.06 | 0.04 | 530.0 | 506.70 |
| kollektiv_positiv | 0.41 | 194/472 | 0.34 | 202/592 | -0.13 | -0.13 | -0.07 | -0.01 | -0.01 | 5.48 | 0.02 | 0.02 | 0.07 | 194.0 | 175.67 |
| denkmal | 0.09 | 72/814 | 0.00 | 1/1036 | -0.11 | -0.11 | -0.09 | -0.07 | -0.07 | 92.05 | 0.00 | 0.00 | 0.22 | 1.0 | 32.12 |
| entity_neutral | 0.36 | 525/1453 | 0.26 | 632/2463 | -0.13 | -0.13 | -0.10 | -0.07 | -0.08 | 48.15 | 0.00 | 0.00 | 0.11 | 525.0 | 429.30 |
| lied | 0.14 | 113/814 | 0.03 | 29/1036 | -0.14 | -0.14 | -0.11 | -0.09 | -0.09 | 79.01 | 0.00 | 0.00 | 0.21 | 29.0 | 62.48 |
| sprechinstanz_nicht_in_vergangenheit | 0.38 | 313/814 | 0.19 | 200/1036 | -0.24 | -0.23 | -0.19 | -0.15 | -0.15 | 83.39 | 0.00 | 0.00 | 0.21 | 200.0 | 225.72 |
| sprechinstanz_in_vergangenheit | 0.31 | 252/814 | 0.04 | 46/1036 | -0.30 | -0.30 | -0.27 | -0.23 | -0.23 | 237.21 | 0.00 | 0.00 | 0.36 | 46.0 | 131.12 |
| rollengedicht | 0.28 | 229/814 | 0.01 | 8/1036 | -0.31 | -0.30 | -0.27 | -0.24 | -0.24 | 305.51 | 0.00 | 0.00 | 0.41 | 8.0 | 104.28 |
| nogenre | 0.52 | 422/814 | 0.00 | 0/1036 | -0.55 | -0.55 | -0.52 | -0.48 | -0.49 | 695.81 | 0.00 | 0.00 | 0.61 | 0.0 | 185.68 |
In [24]:
# Columns of the relations table that are compared between the two periods.
result_categories = [
    'wenn_nicht', 'wenn_nicht_detail',
    'wenn_ja', 'wenn_ja_detail',
    'diff', 'chi2_p', 'phi',
]

# Split the corpus into an early (1850–1884) and a late (1885–1918) period and
# repeat the analysis for the features that passed the filter.
meta_early = meta_anth_bin.query("1850 <= year <= 1884")
meta_late = meta_anth_bin.query("1885 <= year <= 1918")
filtered_features = results_filtered.index

results_a = relations_binbin(meta=meta_early, main_feature=main_feature, comp_features=filtered_features)
results_b = relations_binbin(meta=meta_late, main_feature=main_feature, comp_features=filtered_features)

# Side-by-side table; the suffixes name the first year of each period.
results_merged = (
    results_a[result_categories]
    .join(results_b[result_categories], lsuffix='_1850', rsuffix='_1885')
    .assign(
        diff_of_diffs=lambda merged: merged['diff_1885'] - merged['diff_1850'],
        diff_of_phis=lambda merged: merged['phi_1885'] - merged['phi_1850'],
    )
)
results_merged.sort_values(by='diff_of_phis').round(3)
Out[24]:
| wenn_nicht_1850 | wenn_nicht_detail_1850 | wenn_ja_1850 | wenn_ja_detail_1850 | diff_1850 | chi2_p_1850 | phi_1850 | wenn_nicht_1885 | wenn_nicht_detail_1885 | wenn_ja_1885 | wenn_ja_detail_1885 | diff_1885 | chi2_p_1885 | phi_1885 | diff_of_diffs | diff_of_phis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sprechakt_auffordern_vorhanden | 0.184 | 98/534 | 0.007 | 6/812 | -0.176 | 0.0 | 0.323 | 0.118 | 33/280 | 0.018 | 4/224 | -0.100 | 0.0 | 0.191 | 0.076 | -0.132 |
| gegenwartsdominant | 0.287 | 153/534 | 0.038 | 31/812 | -0.248 | 0.0 | 0.354 | 0.257 | 72/280 | 0.049 | 11/224 | -0.208 | 0.0 | 0.279 | 0.040 | -0.075 |
| sprechakt_erzaehlen_vorhanden | 0.468 | 250/534 | 0.998 | 810/812 | 0.529 | 0.0 | 0.633 | 0.486 | 136/280 | 1.000 | 224/224 | 0.514 | 0.0 | 0.566 | -0.015 | -0.067 |
| ereignis | 0.515 | 275/534 | 0.991 | 805/812 | 0.476 | 0.0 | 0.585 | 0.479 | 134/280 | 0.978 | 219/224 | 0.499 | 0.0 | 0.541 | 0.023 | -0.044 |
| nationalismus | 0.228 | 122/534 | 0.073 | 59/812 | -0.156 | 0.0 | 0.223 | 0.204 | 57/280 | 0.067 | 15/224 | -0.137 | 0.0 | 0.194 | 0.019 | -0.029 |
| sprechinstanz_markiert | 0.725 | 387/534 | 0.250 | 203/812 | -0.475 | 0.0 | 0.468 | 0.636 | 178/280 | 0.192 | 43/224 | -0.444 | 0.0 | 0.444 | 0.031 | -0.024 |
| gegenwartsbezug | 0.470 | 251/534 | 0.172 | 140/812 | -0.298 | 0.0 | 0.321 | 0.411 | 115/280 | 0.134 | 30/224 | -0.277 | 0.0 | 0.304 | 0.021 | -0.017 |
| fixierbarkeit | 0.519 | 277/534 | 0.756 | 614/812 | 0.237 | 0.0 | 0.246 | 0.475 | 133/280 | 0.705 | 158/224 | 0.230 | 0.0 | 0.232 | -0.007 | -0.014 |
| in_hohem_mass_konkret | 0.620 | 331/534 | 0.958 | 778/812 | 0.338 | 0.0 | 0.434 | 0.650 | 182/280 | 0.991 | 222/224 | 0.341 | 0.0 | 0.425 | 0.003 | -0.010 |
| sprechakt_beschreiben_vorhanden | 0.524 | 280/534 | 0.074 | 60/812 | -0.450 | 0.0 | 0.507 | 0.586 | 164/280 | 0.098 | 22/224 | -0.488 | 0.0 | 0.502 | -0.037 | -0.005 |
| wissen_identisch | 0.288 | 154/534 | 0.027 | 22/812 | -0.261 | 0.0 | 0.379 | 0.304 | 85/280 | 0.013 | 3/224 | -0.290 | 0.0 | 0.380 | -0.029 | 0.001 |
| sprechakt_behaupten_vorhanden | 0.371 | 198/534 | 0.044 | 36/812 | -0.326 | 0.0 | 0.421 | 0.379 | 106/280 | 0.022 | 5/224 | -0.356 | 0.0 | 0.427 | -0.030 | 0.006 |
| zustand | 0.747 | 399/534 | 0.143 | 116/812 | -0.604 | 0.0 | 0.608 | 0.782 | 219/280 | 0.152 | 34/224 | -0.630 | 0.0 | 0.626 | -0.026 | 0.018 |
| unbekanntes_individuum_positiv | 0.234 | 25/107 | 0.430 | 120/279 | 0.196 | 0.0 | 0.182 | 0.095 | 7/74 | 0.380 | 65/171 | 0.286 | 0.0 | 0.288 | 0.089 | 0.106 |
In [25]:
# Relate main_feature to every selected continuous feature across the whole anthology corpus.
bincont_arguments = {
    'meta': meta_anth_bin,
    'main_feature': main_feature,
    'comp_features': cont_comp_features,
}
results = relations_bincont(**bincont_arguments)
In [26]:
# Strongest positive point-biserial correlation first.
results.sort_values(by='pointbiserialr_corr', ascending=False).round(2)
Out[26]:
| wenn_nicht | a_merkmal=0 | a_merkmal=1 | a_merkmal=2 | a_merkmal=3 | a_merkmal>=4 | wenn_ja | b_merkmal=0 | b_merkmal=1 | b_merkmal=2 | ... | pointbiserialr_corr | pointbiserialr_p | ttest_p | cohens_d | mannwhitneyu_stat | mannwhitneyu_p | meandiffs_ci_lower | meandiffs_ci_bootstrap_lower | meandiffs_ci_upper | meandiffs_ci_bootstrap_upper | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| konkretheit | 0.80 | 0.03 [25/814] | 0.63 [513/814] | 0.0 [0/814] | 0.0 [0/814] | 0.0 [0/814] | 0.98 | 0.0 [0/1036] | 0.97 [1000/1036] | 0.0 [0/1036] | ... | 0.42 | 0.0 | 0.0 | -0.89 | 279936.0 | 0.0 | 0.16 | 0.16 | 0.20 | 0.20 |
| words | 223.53 | 0.0 [0/761] | 0.0 [0/761] | 0.0 [0/761] | 0.0 [0/761] | 1.0 [761/761] | 383.51 | 0.0 [0/962] | 0.0 [0/962] | 0.0 [0/962] | ... | 0.36 | 0.0 | 0.0 | -0.81 | 176250.5 | 0.0 | 140.68 | 143.14 | 179.29 | 178.46 |
| entity_count | 1.79 | 0.0 [0/814] | 0.41 [334/814] | 0.43 [346/814] | 0.14 [114/814] | 0.02 [20/814] | 2.38 | 0.0 [0/1036] | 0.12 [125/1036] | 0.47 [487/1036] | ... | 0.34 | 0.0 | 0.0 | -0.73 | 257511.0 | 0.0 | 0.52 | 0.52 | 0.67 | 0.67 |
| bekanntes_individuum_count | 0.80 | 0.38 [313/814] | 0.46 [377/814] | 0.13 [102/814] | 0.02 [18/814] | 0.0 [4/814] | 1.33 | 0.16 [170/1036] | 0.43 [447/1036] | 0.33 [340/1036] | ... | 0.30 | 0.0 | 0.0 | -0.64 | 277199.5 | 0.0 | 0.45 | 0.45 | 0.60 | 0.60 |
| kleinraum_count | 0.49 | 0.53 [431/814] | 0.45 [365/814] | 0.02 [17/814] | 0.0 [1/814] | 0.0 [0/814] | 0.79 | 0.28 [287/1036] | 0.66 [688/1036] | 0.05 [56/1036] | ... | 0.25 | 0.0 | 0.0 | -0.53 | 310246.5 | 0.0 | 0.24 | 0.25 | 0.35 | 0.34 |
| sprechakte_count | 1.59 | 0.0 [0/814] | 0.45 [366/814] | 0.51 [413/814] | 0.04 [34/814] | 0.0 [1/814] | 1.13 | 0.0 [0/1036] | 0.87 [904/1036] | 0.13 [131/1036] | ... | -0.45 | 0.0 | 0.0 | 0.99 | 602078.5 | 0.0 | -0.51 | -0.51 | -0.42 | -0.42 |
6 rows × 22 columns
In [27]:
# Axis / legend labels for the features that may be plotted; built once
# instead of on every loop iteration.
histogram_labels = {
    'plot_legend' : '',
    'konkretheit' : 'Konkretheit',
    'entity_count' : 'Anzahl Entitäten',
    'words' : 'Anzahl Wörter',
    'kleinraum_count' : 'Anzahl Kleinräume',
    'bekanntes_individuum_count' : 'Anzahl bekannte Individuen',
    'sprechakte_count' : 'Anzahl Sprechakte',
    'nichtmensch_count' : 'Anzahl nichtmenschliche Entitäten',
    'entity_negativ' : 'Anzahl negativ bewertete Entitäten',
}

# Work on a copy because the loop clips 'words' and adds a legend column.
meta_plot = meta_anth_bin.copy()

# One grouped histogram per continuous feature: texts with main_feature == 1
# versus all other texts, each labelled with its group mean.
for cont_comp_feature in cont_comp_features:
    is_main = meta_plot[main_feature] == 1
    # Means are taken before any clipping so the labels show the real averages.
    mean_main = meta_plot.loc[is_main, cont_comp_feature].mean()
    mean_notmain = meta_plot.loc[meta_plot[main_feature] == 0, cont_comp_feature].mean()

    if cont_comp_feature == 'words':
        # Cap very long texts at 1250 words for the plot.
        meta_plot['words'] = meta_plot['words'].clip(upper=1250)
        label_main = f"Balladen<br>(Mittelwert = {round(mean_main)})"
        label_notmain = f"Nicht-Balladen<br>(Mittelwert = {round(mean_notmain)})"
    else:
        label_main = f"Balladen<br>(Mittelwert = {round(mean_main, 2)})"
        label_notmain = f"Nicht-Balladen<br>(Mittelwert = {round(mean_notmain, 2)})"

    meta_plot['plot_legend'] = [label_main if flag else label_notmain for flag in is_main]

    fig = px.histogram(
        meta_plot,
        x = cont_comp_feature,
        color = 'plot_legend',
        histnorm = 'probability density',
        barmode = 'group',
        labels = histogram_labels,
    )
    # The axis title font is set through title.font: the older 'titlefont'
    # attribute is deprecated and no longer accepted by newer Plotly releases.
    fig.update_layout(
        width = 700, height = 300,
        xaxis = dict(
            tickfont = dict(size=16),
            title = dict(font = dict(size=16)),
        ),
        yaxis = dict(
            tickfont = dict(size=16),
            title = dict(text = "Anteil", font = dict(size=16)),
        ),
        legend = dict(font = dict(size=16), x=0.61, y = 0.88),
        bargap = 0.1,
    )
    fig = update_fig_for_publication(fig)
    fig.write_image(f"plots/6.3.2 Balladen – {cont_comp_feature}.pdf")
    fig.show()
In [28]:
# Columns of the relations table that are compared between the two periods.
result_categories = ['wenn_nicht', 'wenn_ja', 'mannwhitneyu_p', 'pointbiserialr_corr']

# Repeat the continuous-feature analysis for the early (1850–1884) and the
# late (1885–1918) part of the corpus.
meta_early = meta_anth_bin.query("1850 <= year <= 1884")
meta_late = meta_anth_bin.query("1885 <= year <= 1918")

results_a = relations_bincont(meta=meta_early, main_feature=main_feature, comp_features=cont_comp_features)
results_b = relations_bincont(meta=meta_late, main_feature=main_feature, comp_features=cont_comp_features)

# Side-by-side table; the suffixes name the first year of each period.
results_merged = (
    results_a[result_categories]
    .join(results_b[result_categories], lsuffix='_1850', rsuffix='_1885')
    .assign(
        diff_of_corrs=lambda merged: merged['pointbiserialr_corr_1885'] - merged['pointbiserialr_corr_1850'],
    )
)
results_merged.sort_values(by='diff_of_corrs').round(3)
Out[28]:
| wenn_nicht_1850 | wenn_ja_1850 | mannwhitneyu_p_1850 | pointbiserialr_corr_1850 | wenn_nicht_1885 | wenn_ja_1885 | mannwhitneyu_p_1885 | pointbiserialr_corr_1885 | diff_of_corrs | |
|---|---|---|---|---|---|---|---|---|---|
| bekanntes_individuum_count | 0.813 | 1.406 | 0.0 | 0.333 | 0.775 | 1.036 | 0.001 | 0.160 | -0.173 |
| konkretheit | 0.792 | 0.979 | 0.0 | 0.428 | 0.814 | 0.996 | 0.000 | 0.413 | -0.015 |
| entity_count | 1.833 | 2.405 | 0.0 | 0.327 | 1.693 | 2.277 | 0.000 | 0.337 | 0.010 |
| sprechakte_count | 1.584 | 1.126 | 0.0 | -0.449 | 1.614 | 1.138 | 0.000 | -0.438 | 0.011 |
| kleinraum_count | 0.541 | 0.799 | 0.0 | 0.223 | 0.404 | 0.754 | 0.000 | 0.295 | 0.072 |
| words | 236.739 | 393.223 | 0.0 | 0.333 | 200.706 | 351.327 | 0.000 | 0.423 | 0.090 |
Textlänge in Wörtern¶
In [29]:
# Box plot of text length per decade. The word counts are shown unclipped
# here; hovering over a point shows its author_title value.
meta_plot = meta_anth_bin.copy()
box_labels = {'decade' : 'Dekade', 'words' : 'Textlänge in Wörtern'}
fig = px.box(meta_plot, x='decade', y='words', hover_data=['author_title'], labels=box_labels)
fig.show()
Inhaltstyp¶
In [30]:
# Time-series plot of the two '*_share_smoothed' content-type columns of `ts`,
# renamed to their display labels.
meta_plot = ts[['ereignis_share_smoothed', 'zustand_share_smoothed']].rename(
    columns = {
        'ereignis_share_smoothed' : 'Ereignis',
        'zustand_share_smoothed' : 'Zustand',
    }
)

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil',
    # Values of the additional corpora are read from sub_df
    # (presumably drawn as reference markers — confirm in create_ts_plot).
    add_corporas = sub_df,
    add_corpora_names = sub_names,
    add_corpora_categories = ['Ereignis (Anteil)', 'Zustand (Anteil)'],
)
fig.show()
Sprechakte¶
In [31]:
# Time-series plot of the five speech-act '*_share_smoothed' columns of `ts`,
# renamed to their display labels (column order is preserved).
meta_plot = ts[[
    'erzaehlen_share_smoothed',
    'beschreiben_share_smoothed',
    'behaupten_share_smoothed',
    'auffordern_share_smoothed',
    'fragen_share_smoothed',
]].rename(
    columns = {
        'erzaehlen_share_smoothed' : 'Erzählen',
        'beschreiben_share_smoothed' : 'Beschreiben',
        'behaupten_share_smoothed' : 'Behaupten',
        'auffordern_share_smoothed' : 'Auffordern',
        'fragen_share_smoothed' : 'Fragen',
    }
)

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil',
    add_corporas = sub_df,
    add_corpora_names = sub_names,
    add_corpora_categories = [
        'Erzählen (Anteil)',
        'Beschreiben (Anteil)',
        'Behaupten (Anteil)',
        'Auffordern (Anteil)',
        'Fragen (Anteil)',
    ],
)
fig.show()
Rollengedicht – Charakteristische Merkmale¶
In [32]:
# Column analysed in this section; the following cells read it as `main_feature`.
main_feature = 'rollengedicht'
In [33]:
# The 20 columns with the largest correlation to the main feature
# (the main feature itself comes first with 1.0).
(
    meta_anth_bin
    .corr(numeric_only=True)[main_feature]
    .sort_values(ascending = False)
    .iloc[:20]
)
Out[33]:
rollengedicht 1.000000 sprechinstanz_in_vergangenheit 0.791177 sprechinstanz_markiert 0.433865 geschichtsauffassung_negativ 0.429320 sprechakt_beschreiben_vorhanden 0.285456 zustand 0.248189 sprechakt_auffordern_vorhanden 0.231238 lied 0.223636 sprechakte_count 0.214174 entity_neutral 0.200063 sprechakt_behaupten_vorhanden 0.144516 stoffgebiet_neutral 0.143268 wissen_ergaenzend 0.122218 antike 0.100717 politik_negativ 0.099161 religion_negativ 0.098338 year_predict_ages_mean 0.097129 decade 0.096419 year 0.086274 krieg_negativ 0.082220 Name: rollengedicht, dtype: float64
In [34]:
# The 20 columns with the smallest (most negative) correlation to the main feature.
(
    meta_anth_bin
    .corr(numeric_only=True)[main_feature]
    .sort_values(ascending = True)
    .iloc[:20]
)
Out[34]:
ballade -0.406373 sprechakt_erzaehlen_vorhanden -0.328975 bekanntes_individuum_positiv -0.277209 entity_positiv -0.253425 ereignis -0.249965 religion_positiv -0.237342 nogenre -0.208377 geschichtsauffassung_positiv -0.207562 krieg_positiv -0.203894 heroismus -0.186768 nation_volk_d_positiv -0.181592 gegenwartsbezug -0.173510 sprechinstanz_nicht_in_vergangenheit -0.168793 stoffgebiet_positiv -0.162961 politik_positiv -0.160560 unbekanntes_individuum_positiv -0.153061 wissen -0.149309 wissen_identisch -0.142517 fixierbarkeit -0.139487 bekanntes_individuum_count -0.137366 Name: rollengedicht, dtype: float64
In [35]:
# Select the comparison features for the main feature.
# `threshold` is passed to get_features and is reused later as the minimum phi
# when the binary results are filtered.
threshold = 0.2

# Correlation of every numeric column with the main feature. Computed once and
# shared by both get_features calls instead of rebuilding the full correlation
# matrix for each call.
main_feature_corrs = meta_anth_bin.corr(numeric_only=True)[main_feature]

# Binary features picked by get_features (mode 'bin') ...
bin_comp_features = get_features(main_feature_corrs, threshold = threshold, mode = 'bin')
# ... plus features that are always included, whatever get_features returned.
bin_comp_features = bin_comp_features + [
    'entity_positiv', 'entity_negativ',
    'stoffgebiet_neutral', 'stoffgebiet_positiv', 'stoffgebiet_negativ',
    'religion', 'nationalismus'
]

# Continuous features picked by get_features (mode 'cont').
cont_comp_features = get_features(main_feature_corrs, threshold = threshold, mode = 'cont')
In [36]:
# Relate the main feature to every selected binary comparison feature,
# using the whole anthology corpus.
results = relations_binbin(
    meta = meta_anth_bin, main_feature = main_feature, comp_features = bin_comp_features,
)
In [37]:
# Genre columns that are excluded from the filtered table.
directly_related = ['nogenre', 'ballade', 'lied']

# Step 1: drop the genre columns listed above.
results_filtered = results.query("index not in @directly_related")
# Step 2: keep rows with chi2_p below 0.05, min_expected of at least 5,
# and phi of at least `threshold`.
results_filtered = results_filtered.query("chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold")
# Step 3: order by the 'diff' column, largest first.
results_filtered = results_filtered.sort_values(by = 'diff', ascending = False)

results_filtered.round(2)
Out[37]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sprechinstanz_in_vergangenheit | 0.05 | 80/1613 | 0.92 | 218/237 | 0.83 | 0.83 | 0.87 | 0.91 | 0.91 | 1158.03 | 0.0 | 0.0 | 0.79 | 19.0 | 38.18 |
| sprechinstanz_markiert | 0.36 | 574/1613 | 1.00 | 237/237 | 0.62 | 0.62 | 0.64 | 0.67 | 0.67 | 348.24 | 0.0 | 0.0 | 0.43 | 0.0 | 103.90 |
| sprechakt_beschreiben_vorhanden | 0.23 | 379/1613 | 0.62 | 147/237 | 0.32 | 0.32 | 0.39 | 0.45 | 0.45 | 150.75 | 0.0 | 0.0 | 0.29 | 90.0 | 67.38 |
| zustand | 0.37 | 594/1613 | 0.73 | 174/237 | 0.31 | 0.30 | 0.37 | 0.43 | 0.43 | 113.96 | 0.0 | 0.0 | 0.25 | 63.0 | 98.39 |
| entity_neutral | 0.26 | 904/3469 | 0.57 | 253/447 | 0.26 | 0.26 | 0.31 | 0.35 | 0.36 | 177.42 | 0.0 | 0.0 | 0.21 | 194.0 | 132.07 |
| sprechakt_auffordern_vorhanden | 0.05 | 85/1613 | 0.24 | 56/237 | 0.13 | 0.13 | 0.18 | 0.24 | 0.24 | 98.92 | 0.0 | 0.0 | 0.23 | 56.0 | 18.06 |
| krieg_positiv | 0.57 | 395/699 | 0.26 | 25/98 | -0.40 | -0.40 | -0.31 | -0.22 | -0.22 | 33.13 | 0.0 | 0.0 | 0.20 | 25.0 | 46.36 |
| ereignis | 0.81 | 1314/1613 | 0.50 | 119/237 | -0.38 | -0.38 | -0.31 | -0.25 | -0.25 | 115.59 | 0.0 | 0.0 | 0.25 | 118.0 | 53.42 |
| religion_positiv | 0.55 | 124/226 | 0.22 | 9/41 | -0.47 | -0.47 | -0.33 | -0.19 | -0.19 | 15.04 | 0.0 | 0.0 | 0.24 | 9.0 | 20.42 |
| sprechakt_erzaehlen_vorhanden | 0.82 | 1324/1613 | 0.41 | 96/237 | -0.48 | -0.48 | -0.42 | -0.35 | -0.35 | 200.22 | 0.0 | 0.0 | 0.33 | 96.0 | 55.09 |
In [38]:
# Everything that did not make it into results_filtered, ordered by 'diff' (largest first).
results_other = results.query("index not in @results_filtered.index")
results_other.sort_values(by = 'diff', ascending = False).round(4)
Out[38]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| geschichtsauffassung_negativ | 0.1690 | 12/71 | 0.8333 | 5/6 | 0.3028 | 0.3536 | 0.6643 | 0.9750 | 0.8873 | 14.1923 | 0.0002 | 0.0016 | 0.4293 | 1.0 | 1.3247 |
| lied | 0.0539 | 87/1613 | 0.2321 | 55/237 | 0.1235 | 0.1233 | 0.1781 | 0.2330 | 0.2332 | 92.5243 | 0.0000 | 0.0000 | 0.2236 | 55.0 | 18.1914 |
| stoffgebiet_neutral | 0.1688 | 373/2210 | 0.3303 | 109/330 | 0.1077 | 0.1084 | 0.1615 | 0.2146 | 0.2146 | 48.7221 | 0.0000 | 0.0000 | 0.1385 | 109.0 | 62.6220 |
| stoffgebiet_negativ | 0.2054 | 454/2210 | 0.2485 | 82/330 | -0.0060 | -0.0065 | 0.0431 | 0.0926 | 0.0920 | 3.1969 | 0.0738 | 0.0824 | 0.0355 | 82.0 | 69.6378 |
| stoffgebiet_ambivalent | 0.1285 | 284/2210 | 0.1636 | 54/330 | -0.0069 | -0.0072 | 0.0351 | 0.0774 | 0.0789 | 3.0715 | 0.0797 | 0.0825 | 0.0348 | 54.0 | 43.9134 |
| religion | 0.1401 | 226/1613 | 0.1730 | 41/237 | -0.0183 | -0.0182 | 0.0329 | 0.0839 | 0.0855 | 1.8094 | 0.1786 | 0.1976 | 0.0313 | 41.0 | 34.2049 |
| entity_ambivalent | 0.0628 | 218/3469 | 0.0582 | 26/447 | -0.0271 | -0.0278 | -0.0047 | 0.0185 | 0.0193 | 0.1482 | 0.7002 | 0.7560 | 0.0062 | 26.0 | 27.8519 |
| bekanntes_individuum_negativ | 0.1381 | 254/1839 | 0.1183 | 22/186 | -0.0667 | -0.0689 | -0.0198 | 0.0292 | 0.0281 | 0.5648 | 0.4524 | 0.5023 | 0.0167 | 22.0 | 25.3511 |
| entity_negativ | 0.1683 | 584/3469 | 0.1342 | 60/447 | -0.0659 | -0.0681 | -0.0341 | -0.0002 | 0.0032 | 3.3549 | 0.0670 | 0.0674 | 0.0293 | 60.0 | 73.5107 |
| kollektiv_negativ | 0.2575 | 240/932 | 0.2121 | 28/132 | -0.1222 | -0.1206 | -0.0454 | 0.0298 | 0.0292 | 1.2641 | 0.2609 | 0.2852 | 0.0345 | 28.0 | 33.2481 |
| unbekanntes_individuum_negativ | 0.1407 | 74/526 | 0.0667 | 7/105 | -0.1236 | -0.1302 | -0.0740 | -0.0178 | -0.0150 | 4.2858 | 0.0384 | 0.0380 | 0.0824 | 7.0 | 13.4786 |
| nationalismus | 0.1463 | 236/1613 | 0.0717 | 17/237 | -0.1097 | -0.1117 | -0.0746 | -0.0375 | -0.0351 | 9.7362 | 0.0018 | 0.0011 | 0.0725 | 17.0 | 32.4114 |
| kollektiv_positiv | 0.3906 | 364/932 | 0.2424 | 32/132 | -0.2281 | -0.2277 | -0.1481 | -0.0686 | -0.0691 | 10.8584 | 0.0010 | 0.0010 | 0.1010 | 32.0 | 49.1278 |
| unbekanntes_individuum_positiv | 0.3745 | 197/526 | 0.1905 | 20/105 | -0.2640 | -0.2698 | -0.1840 | -0.0983 | -0.0945 | 13.1404 | 0.0003 | 0.0003 | 0.1443 | 20.0 | 36.1094 |
| stoffgebiet_positiv | 0.4973 | 1099/2210 | 0.2576 | 85/330 | -0.2924 | -0.2913 | -0.2397 | -0.1881 | -0.1924 | 66.2977 | 0.0000 | 0.0000 | 0.1616 | 85.0 | 153.8268 |
| nogenre | 0.2616 | 422/1613 | 0.0000 | 0/237 | -0.2839 | -0.2831 | -0.2616 | -0.2402 | -0.2399 | 80.3286 | 0.0000 | 0.0000 | 0.2084 | 0.0 | 54.0616 |
| entity_positiv | 0.5082 | 1763/3469 | 0.2416 | 108/447 | -0.3102 | -0.3096 | -0.2666 | -0.2236 | -0.2212 | 112.8038 | 0.0000 | 0.0000 | 0.1697 | 108.0 | 213.5692 |
| bekanntes_individuum_positiv | 0.6003 | 1104/1839 | 0.2581 | 48/186 | -0.4099 | -0.4090 | -0.3423 | -0.2755 | -0.2805 | 80.6807 | 0.0000 | 0.0000 | 0.1996 | 48.0 | 80.1867 |
| geschichtsauffassung_positiv | 0.3662 | 26/71 | 0.0000 | 0/6 | -0.4789 | -0.4783 | -0.3662 | -0.2541 | -0.2535 | 3.3173 | 0.0686 | 0.0911 | 0.2076 | 0.0 | 2.0260 |
| ballade | 0.6373 | 1028/1613 | 0.0338 | 8/237 | -0.6354 | -0.6364 | -0.6036 | -0.5707 | -0.5668 | 305.5065 | 0.0000 | 0.0000 | 0.4064 | 8.0 | 104.2800 |
In [39]:
# Compare the binary-feature relations of the main feature between an early
# and a late sub-period of the anthology corpus, restricted to the features
# that passed the filter in results_filtered.
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]

# Inclusive year bounds of both sub-periods. The column suffixes are derived
# from the first year of each period so that the labels always match the
# filter (the late period used to be filtered from 1890 but labelled '_1885').
# NOTE(review): the continuous-feature comparison cells split at 1884/1885;
# confirm whether the 1889/1890 boundary is intended here.
period_a = (1850, 1889)
period_b = (1890, 1918)
suffix_a = f"_{period_a[0]}"
suffix_b = f"_{period_b[0]}"

results_a = relations_binbin(
    meta = meta_anth_bin.query(f"{period_a[0]} <= year <= {period_a[1]}"),
    main_feature = main_feature,
    comp_features = list(results_filtered.index)
)
results_b = relations_binbin(
    meta = meta_anth_bin.query(f"{period_b[0]} <= year <= {period_b[1]}"),
    main_feature = main_feature,
    comp_features = list(results_filtered.index)
)

# Side-by-side table of both periods plus the change (late minus early)
# of the 'diff' and 'phi' columns.
results_merged = results_a[result_categories].join(
    results_b[result_categories],
    lsuffix = suffix_a, rsuffix = suffix_b
)
results_merged['diff_of_diffs'] = results_merged[f'diff{suffix_b}'] - results_merged[f'diff{suffix_a}']
results_merged['diff_of_phis'] = results_merged[f'phi{suffix_b}'] - results_merged[f'phi{suffix_a}']
round(results_merged.sort_values(by = 'diff_of_phis'), 3)
Out[39]:
| wenn_nicht_1850 | wenn_nicht_detail_1850 | wenn_ja_1850 | wenn_ja_detail_1850 | diff_1850 | chi2_p_1850 | phi_1850 | wenn_nicht_1885 | wenn_nicht_detail_1885 | wenn_ja_1885 | wenn_ja_detail_1885 | diff_1885 | chi2_p_1885 | phi_1885 | diff_of_diffs | diff_of_phis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ereignis | 0.838 | 1073/1281 | 0.483 | 85/176 | -0.355 | 0.0 | 0.286 | 0.726 | 241/332 | 0.557 | 34/61 | -0.169 | 0.008 | 0.133 | 0.186 | -0.153 |
| sprechakt_erzaehlen_vorhanden | 0.838 | 1074/1281 | 0.392 | 69/176 | -0.446 | 0.0 | 0.354 | 0.753 | 250/332 | 0.443 | 27/61 | -0.310 | 0.000 | 0.246 | 0.136 | -0.107 |
| religion_positiv | 0.569 | 116/204 | 0.219 | 7/32 | -0.350 | 0.0 | 0.240 | 0.364 | 8/22 | 0.222 | 2/9 | -0.141 | 0.445 | 0.137 | 0.208 | -0.102 |
| sprechakt_beschreiben_vorhanden | 0.210 | 269/1281 | 0.625 | 110/176 | 0.415 | 0.0 | 0.308 | 0.331 | 110/332 | 0.607 | 37/61 | 0.275 | 0.000 | 0.206 | -0.140 | -0.102 |
| sprechakt_auffordern_vorhanden | 0.052 | 67/1281 | 0.256 | 45/176 | 0.203 | 0.0 | 0.249 | 0.054 | 18/332 | 0.180 | 11/61 | 0.126 | 0.001 | 0.175 | -0.077 | -0.074 |
| zustand | 0.346 | 443/1281 | 0.739 | 130/176 | 0.393 | 0.0 | 0.262 | 0.455 | 151/332 | 0.721 | 44/61 | 0.266 | 0.000 | 0.193 | -0.126 | -0.069 |
| sprechinstanz_in_vergangenheit | 0.049 | 63/1281 | 0.909 | 160/176 | 0.860 | 0.0 | 0.778 | 0.051 | 17/332 | 0.951 | 58/61 | 0.900 | 0.000 | 0.829 | 0.040 | 0.051 |
| sprechinstanz_markiert | 0.363 | 465/1281 | 1.000 | 176/176 | 0.637 | 0.0 | 0.418 | 0.328 | 109/332 | 1.000 | 61/61 | 0.672 | 0.000 | 0.491 | 0.035 | 0.073 |
| krieg_positiv | 0.588 | 316/537 | 0.308 | 20/65 | -0.281 | 0.0 | 0.175 | 0.488 | 79/162 | 0.152 | 5/33 | -0.336 | 0.000 | 0.255 | -0.055 | 0.079 |
| entity_neutral | 0.247 | 695/2814 | 0.520 | 178/342 | 0.273 | 0.0 | 0.190 | 0.319 | 209/655 | 0.714 | 75/105 | 0.395 | 0.000 | 0.282 | 0.122 | 0.092 |
In [40]:
# Same comparison again, but without ballads among the non-role poems:
# a row is kept if it is not a ballad or if it is a role poem.
results = relations_binbin(
    meta = meta_anth_bin.query("ballade == False or rollengedicht == True"),
    main_feature = main_feature,
    comp_features = results_filtered.index,  # only the features that passed the filter
)
In [41]:
# Show the result table rounded to two decimals.
results.round(2)
Out[41]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sprechinstanz_in_vergangenheit | 0.07 | 39/585 | 0.92 | 218/237 | 0.81 | 0.81 | 0.85 | 0.89 | 0.89 | 571.29 | 0.00 | 0.00 | 0.83 | 19.0 | 74.10 |
| sprechinstanz_markiert | 0.57 | 336/585 | 1.00 | 237/237 | 0.39 | 0.39 | 0.43 | 0.47 | 0.47 | 144.71 | 0.00 | 0.00 | 0.42 | 0.0 | 71.79 |
| sprechakt_beschreiben_vorhanden | 0.51 | 299/585 | 0.62 | 147/237 | 0.03 | 0.04 | 0.11 | 0.18 | 0.18 | 8.10 | 0.00 | 0.01 | 0.10 | 90.0 | 108.41 |
| zustand | 0.76 | 444/585 | 0.73 | 174/237 | -0.09 | -0.09 | -0.02 | 0.04 | 0.04 | 0.56 | 0.46 | 0.48 | 0.03 | 63.0 | 58.82 |
| entity_neutral | 0.28 | 288/1029 | 0.57 | 253/447 | 0.23 | 0.23 | 0.29 | 0.34 | 0.34 | 109.87 | 0.00 | 0.00 | 0.27 | 194.0 | 163.84 |
| sprechakt_auffordern_vorhanden | 0.13 | 75/585 | 0.24 | 56/237 | 0.04 | 0.05 | 0.11 | 0.17 | 0.17 | 14.71 | 0.00 | 0.00 | 0.13 | 56.0 | 37.77 |
| krieg_positiv | 0.63 | 139/219 | 0.26 | 25/98 | -0.48 | -0.49 | -0.38 | -0.27 | -0.28 | 39.07 | 0.00 | 0.00 | 0.35 | 25.0 | 47.30 |
| ereignis | 0.51 | 298/585 | 0.50 | 119/237 | -0.08 | -0.08 | -0.01 | 0.07 | 0.07 | 0.04 | 0.85 | 0.88 | 0.01 | 118.0 | 116.77 |
| religion_positiv | 0.59 | 34/58 | 0.22 | 9/41 | -0.54 | -0.55 | -0.37 | -0.19 | -0.17 | 13.15 | 0.00 | 0.00 | 0.36 | 9.0 | 17.81 |
| sprechakt_erzaehlen_vorhanden | 0.51 | 298/585 | 0.41 | 96/237 | -0.18 | -0.18 | -0.10 | -0.03 | -0.03 | 7.36 | 0.01 | 0.01 | 0.09 | 96.0 | 113.60 |
In [42]:
# Relate the main feature to every selected continuous comparison feature,
# using the whole anthology corpus.
results = relations_bincont(
    meta = meta_anth_bin, main_feature = main_feature, comp_features = cont_comp_features,
)
In [43]:
# Show the result table ordered by 'pointbiserialr_corr' (largest first), two decimals.
results.sort_values(by = 'pointbiserialr_corr', ascending = False).round(2)
Out[43]:
| wenn_nicht | a_merkmal=0 | a_merkmal=1 | a_merkmal=2 | a_merkmal=3 | a_merkmal>=4 | wenn_ja | b_merkmal=0 | b_merkmal=1 | b_merkmal=2 | ... | pointbiserialr_corr | pointbiserialr_p | ttest_p | cohens_d | mannwhitneyu_stat | mannwhitneyu_p | meandiffs_ci_lower | meandiffs_ci_bootstrap_lower | meandiffs_ci_upper | meandiffs_ci_bootstrap_upper | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sprechakte_count | 1.29 | 0.0 [0/1613] | 0.72 [1169/1613] | 0.26 [419/1613] | 0.01 [24/1613] | 0.0 [1/1613] | 1.62 | 0.0 [0/237] | 0.43 [101/237] | 0.53 [125/237] | ... | 0.21 | 0.0 | 0.0 | -0.62 | 133334.0 | 0.0 | 0.26 | 0.25 | 0.4 | 0.41 |
1 rows × 22 columns
In [44]:
# For every continuous comparison feature: draw grouped, density-normalised
# histograms for poems with main_feature == 1 ("Rollengedichte") versus all
# others, write the figure to a PDF below plots/ and display it.
meta_plot = meta_anth_bin.copy()

# Cap 'words' at 1250 for plotting only. This does not depend on the loop
# variable, so it is done once up front instead of in every iteration.
meta_plot['words'] = meta_plot['words'].clip(upper=1250)

for cont_comp_feature in cont_comp_features:
    # Legend means are taken from the unclipped frame. Previously they were
    # read from meta_plot, which was clipped at the end of the first
    # iteration, so the mean reported for 'words' depended on where that
    # feature appeared in cont_comp_features.
    mean_main = meta_anth_bin[meta_anth_bin[main_feature] == 1][cont_comp_feature].mean()
    mean_notmain = meta_anth_bin[meta_anth_bin[main_feature] == 0][cont_comp_feature].mean()

    label_main = f"Rollengedichte<br>(Mittelwert = {round(mean_main, 2)})"
    label_notmain = f"Nicht-Rollengedichte<br>(Mittelwert = {round(mean_notmain, 2)})"
    meta_plot['plot_legend'] = [label_main if x == 1 else label_notmain for x in meta_plot[main_feature]]

    fig = px.histogram(
        # Sorting by the legend text fixes the order of the two groups.
        meta_plot.sort_values(by='plot_legend', ascending=False),
        x = cont_comp_feature,
        color = 'plot_legend',
        histnorm = 'probability density',
        barmode = 'group',
        labels = {'plot_legend' : '',
                  'entity_neutral' : 'Anzahl neutral bewertete Entitäten',
                  'entity_positiv' : 'Anzahl positiv bewertete Entitäten',
                  'sprechakte_count' : 'Anzahl Sprechakte',
                 }
    )
    fig.update_layout(
        width = 700, height = 300,
        yaxis_title="Anteil",
        # `title.font` replaces the deprecated `titlefont` axis attribute.
        xaxis=dict(tickfont=dict(size=16), title=dict(font=dict(size=16))),
        yaxis=dict(tickfont=dict(size=16), title=dict(font=dict(size=16))),
        legend=dict(font = dict(size=16), x=0.59, y = 0.88),
        bargap=0.1
    )
    fig = update_fig_for_publication(fig)
    fig.write_image(f"plots/6.3.2 Rollengedichte – {cont_comp_feature}.pdf")
    fig.show()
In [45]:
# Compare the continuous comparison features between two sub-periods of the
# anthology corpus (1850–1884 vs. 1885–1918) for the current main feature.
result_categories = ['wenn_nicht', 'wenn_ja', 'mannwhitneyu_p', 'pointbiserialr_corr',]

# Earlier sub-period.
results_a = relations_bincont(
    meta = meta_anth_bin.query("1850 <= year <= 1884"),
    main_feature = main_feature,
    comp_features = cont_comp_features,
)

# Later sub-period.
results_b = relations_bincont(
    meta = meta_anth_bin.query("1885 <= year <= 1918"),
    main_feature = main_feature,
    comp_features = cont_comp_features,
)

# Put both periods side by side (suffix = first year of the period) and add
# the change of the 'pointbiserialr_corr' column from the early to the late period.
results_merged = (
    results_a[result_categories]
    .join(results_b[result_categories], lsuffix='_1850', rsuffix='_1885')
    .assign(
        diff_of_corrs = lambda merged: merged['pointbiserialr_corr_1885'] - merged['pointbiserialr_corr_1850']
    )
)
results_merged.sort_values(by = 'diff_of_corrs').round(3)
Out[45]:
| wenn_nicht_1850 | wenn_ja_1850 | mannwhitneyu_p_1850 | pointbiserialr_corr_1850 | wenn_nicht_1885 | wenn_ja_1885 | mannwhitneyu_p_1885 | pointbiserialr_corr_1885 | diff_of_corrs | |
|---|---|---|---|---|---|---|---|---|---|
| sprechakte_count | 1.265 | 1.62 | 0.0 | 0.232 | 1.365 | 1.622 | 0.0 | 0.168 | -0.064 |